% load_ext rpy2.ipython
import rpy2.robjects as robj
%%R
# Record the maximum of a standard-normal sample whose size doubles each iteration.
N_iter=20
# Pre-allocate one slot per iteration; NA marks entries not yet computed.
y=rep(NA,N_iter)
# Fix the RNG seed so the draws, and therefore y, are reproducible.
set.seed(123)
for(i in 1:N_iter){
# Progress indicator: prints i followed by a carriage return instead of a newline.
cat(i,"\r")
# Draw 2^i standard-normal values (2, 4, ..., 2^N_iter).
rands=rnorm(2^i)
y[i]=max(rands)
}
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
%%R
# Line plot (type "l") of the recorded maxima against the iteration index.
plot(1:N_iter, y, type="l")
%%R
# Load the grades table and compute every row's mean grade with an explicit loop.
grades=read.csv("grades.csv")
# Drop the column named X.
# NOTE(review): presumably the unnamed index column of the CSV -- confirm.
grades$X=NULL
head(grades)
# Pre-allocate the result instead of growing a vector created with c().
meangrades=numeric(nrow(grades))
# seq_len() gives an empty sequence for a zero-row table, whereas 1:nrow()
# would iterate over c(1, 0) and index rows that do not exist.
for (i in seq_len(nrow(grades))){
  # Column 1 is left out of the mean; as.matrix() turns the one-row data
  # frame into a numeric matrix that mean() accepts.
  meangrades[i]=mean(as.matrix(grades[i,2:ncol(grades)]))
}
head(meangrades)
[1] 49.25 59.00 44.00 50.00 55.75 56.75
%%R
# Same per-row means without an explicit loop: apply() with margin 1 calls
# mean on every row of the selected columns (column 1 is left out).
meangrades = apply (grades[,2:ncol(grades)],1,mean)
head(meangrades)
[1] 49.25 59.00 44.00 50.00 55.75 56.75
import rpy2.robjects as robj
from rpy2.robjects import r # R instance
import numpy as np
import pandas as pd
r["meangrades"]
49.250000 | 59.000000 | 44.000000 | 50.000000 | ... | 41.500000 | 51.250000 | 59.750000 | 61.000000 |
type(r["meangrades"])
rpy2.robjects.vectors.FloatVector
# Build a NumPy array from the R vector `meangrades` fetched from the embedded R session.
meangrades_np = np.array(r["meangrades"])
# Show the first ten values.
meangrades_np[:10]
array([49.25, 59. , 44. , 50. , 55.75, 56.75, 53.75, 41.5 , 55. , 52. ])
r["meangrades_back"]
--------------------------------------------------------------------------- LookupError Traceback (most recent call last) <ipython-input-62-d770a250cbb4> in <module>() ----> 1 r["meangrades_back"] /usr/local/lib/python3.6/dist-packages/rpy2/robjects/__init__.py in __getitem__(self, item) 329 330 def __getitem__(self, item): --> 331 res = _globalenv.get(item) 332 res = conversion.ri2py(res) 333 if hasattr(res, '__rname__'): LookupError: 'meangrades_back' not found
# Wrap the NumPy data as an rpy2 vector, then define `meangrades_back` inside R
# by evaluating an assignment built from the vector's R source representation.
meangrades_back = robj.Vector(meangrades_np)
# The value returned by the R assignment is bound to the throwaway name `_`.
_ = r(f"meangrades_back = {meangrades_back.r_repr()}")
# The lookup now succeeds because the name exists on the R side.
r["meangrades_back"][:10]
array([49.25, 59. , 44. , 50. , 55.75, 56.75, 53.75, 41.5 , 55. , 52. ])
from rpy2.robjects import pandas2ri
pandas2ri.activate()
grades = pd.read_csv("grades.csv", index_col=0)
grades.head()
id | write | math | science | socst | |
---|---|---|---|---|---|
1 | 70 | 52 | 41 | 47 | 57 |
2 | 121 | 59 | 53 | 63 | 61 |
3 | 86 | 33 | 54 | 58 | 31 |
4 | 141 | 44 | 47 | 53 | 56 |
5 | 172 | 52 | 57 | 53 | 61 |
# Convert the pandas DataFrame to an R data frame and define it in R as `grades_back`.
# NOTE(review): `py2ri` is the rpy2 2.x name; rpy2 3.x renamed it to `py2rpy` --
# confirm the installed rpy2 version before changing this call.
r_dataframe = pandas2ri.py2ri(grades)
_ = r(f"grades_back = {r_dataframe.r_repr()}")
# `.head()` is the pandas method here: with pandas2ri activated, reading an R
# data frame back through `r[...]` yields a pandas DataFrame.
r["grades_back"].head()
id | write | math | science | socst | |
---|---|---|---|---|---|
0 | 70 | 52 | 41 | 47 | 57 |
1 | 121 | 59 | 53 | 63 | 61 |
2 | 86 | 33 | 54 | 58 | 31 |
3 | 141 | 44 | 47 | 53 | 56 |
4 | 172 | 52 | 57 | 53 | 61 |
%%R
# Add a placeholder column Filt, filled with NA, to be populated later.
grades$Filt=NA
head(grades)
id write math science socst Filt 1 70 52 41 47 57 NA 2 121 59 53 63 61 NA 3 86 33 54 58 31 NA 4 141 44 47 53 56 NA 5 172 52 57 53 61 NA 6 113 52 51 63 61 NA
%%R
# Label every row by its writing score:
#   "A" for scores above 50, "B" for scores in (40, 50], "C" otherwise.
# seq_len() keeps the loop from running on a zero-row table (1:0 would).
for (i in seq_len(nrow(grades))){
  if (grades$write[i] > 50){
    grades$Filt[i] = "A"
  } else if (grades$write[i] > 40 && grades$write[i] <= 50){
    # `&&` is the scalar logical operator intended for `if` conditions;
    # `&` is the element-wise form meant for vectors.
    grades$Filt[i] = "B"
  } else {
    grades$Filt[i] = "C"
  }
}
head(grades)
id write math science socst Filt 1 70 52 41 47 57 A 2 121 59 53 63 61 A 3 86 33 54 58 31 C 4 141 44 47 53 56 B 5 172 52 57 53 61 A 6 113 52 51 63 61 A
%%R
# Vectorised alternative to an if/else loop: ifelse() evaluates the test for
# every element and picks "pos" or "neg" accordingly.
x = c(-1,4,-5,2,7)
x
ifelse(x > 0, "pos","neg")
[1] "neg" "pos" "neg" "pos" "pos"
%%R
# Raise x to the power y; y defaults to 2, so a single argument is squared.
# `^` works element-wise, so x may be a vector.
My_Func = function (x, y=2){
  powered = x ^ y
  return(powered)
}
%%R
My_Func(x)
[1] 1 16 25 4 49
%%R
My_Func(2)
[1] 4
%%R
My_Func(2,4)
[1] 16
%%R
# A list whose elements are numeric vectors of different lengths (1, 3 and 2).
l = list(1, c(1,2,3), c(3,4))
l
[[1]] [1] 1 [[2]] [1] 1 2 3 [[3]] [1] 3 4
%%R
# lapply() applies sum to every list element and returns the results as a list.
lapply(l, sum)
[[1]] [1] 1 [[2]] [1] 6 [[3]] [1] 7
%%R
# sapply() does the same but simplifies the length-one results into a plain vector.
sapply(l,sum)
[1] 1 6 7
%%R
# Anonymous function returning c(min, max) per element; lapply() keeps the results in a list.
lapply(l, function(x){return(c(min(x), max(x)))})
[[1]] [1] 1 1 [[2]] [1] 1 3 [[3]] [1] 3 4
%%R
# Every result has length two, so sapply() simplifies them into a matrix
# with one column per list element (row 1 = min, row 2 = max).
sapply(l, function(x){return(c(min(x), max(x)))})
[,1] [,2] [,3] [1,] 1 1 3 [2,] 1 3 4
# import rpy2's package module
import rpy2.robjects.packages as rpackages
# import R's utility package
utils = rpackages.importr('utils')
# select a mirror for R packages
utils.chooseCRANmirror(ind=1) # select the first mirror in the list
rpy2.rinterface.NULL
# R package names
packnames = ('reshape2', 'ggplot2')
# R vector of strings
from rpy2.robjects.vectors import StrVector
# Install only the packages that are not installed yet, so re-running this
# cell does not reinstall packages that are already present.
names_to_install = [x for x in packnames if not rpackages.isinstalled(x)]
if names_to_install:
    # install_packages expects an R character vector, hence StrVector.
    utils.install_packages(StrVector(names_to_install))
# Import reshape2 and keep a Python handle to it.
reshape = rpackages.importr('reshape2')
%%R
# Small wide-format example table: one row per person, one column per attribute.
a=data.frame(name=c('John', 'Mary', 'Peter', 'Susan'),
sex=c('m','f','m','f'),
age=c(26,21,19,29),
weight=c(82, 56, 79, 60),
height=c(182, 171, 179, 175))
a
name sex age weight height 1 John m 26 82 182 2 Mary f 21 56 171 3 Peter m 19 79 179 4 Susan f 29 60 175
%%R
# Reshape `a` from wide to long: name and sex stay as identifier columns,
# every other column becomes one (a_var, a_name) row per person.
# reshape2::melt names these arguments with dots (variable.name, value.name);
# the underscore spelling `variable_name` is not one of its arguments, so the
# key column kept its default name "variable".
a_melt = melt (a, id.vars = c('name', 'sex'),
               variable.name = "a_var", value.name = 'a_name')
a_melt
name sex variable a_name 1 John m age 26 2 Mary f age 21 3 Peter m age 19 4 Susan f age 29 5 John m weight 82 6 Mary f weight 56 7 Peter m weight 79 8 Susan f weight 60 9 John m height 182 10 Mary f height 171 11 Peter m height 179 12 Susan f height 175
%%R
# Cast the long table back to wide format. The formula's left side lists the
# identifier columns and its right side the column whose values become the new
# column names, so a_melt must contain a column called a_var.
# value.var names the value column explicitly instead of letting dcast guess it.
dcast(a_melt, name ~ a_var, value.var = "a_name")
dcast(a_melt, name + sex ~ a_var, value.var = "a_name")
Error in FUN(X[[i]], ...) : object 'a_var' not found
df = r["a"]
type(df) # pandas2ri converted it automatically
pandas.core.frame.DataFrame
df
name | sex | age | weight | height | |
---|---|---|---|---|---|
0 | John | m | 26.0 | 82.0 | 182.0 |
1 | Mary | f | 21.0 | 56.0 | 171.0 |
2 | Peter | m | 19.0 | 79.0 | 179.0 |
3 | Susan | f | 29.0 | 60.0 | 175.0 |
# Wide -> long with pandas: keep name and sex as identifiers and stack the
# remaining columns into (a_var, a_name) pairs.
molten = pd.melt(
    df,
    id_vars=["name", "sex"],
    var_name="a_var",
    value_name="a_name",
)
molten
name | sex | a_var | a_name | |
---|---|---|---|---|
0 | John | m | age | 26.0 |
1 | Mary | f | age | 21.0 |
2 | Peter | m | age | 19.0 |
3 | Susan | f | age | 29.0 |
4 | John | m | weight | 82.0 |
5 | Mary | f | weight | 56.0 |
6 | Peter | m | weight | 79.0 |
7 | Susan | f | weight | 60.0 |
8 | John | m | height | 182.0 |
9 | Mary | f | height | 171.0 |
10 | Peter | m | height | 179.0 |
11 | Susan | f | height | 175.0 |
# Long -> wide again: one row per name, one column per a_var value.
molten.pivot(index="name", columns="a_var", values="a_name")
a_var | age | height | weight |
---|---|---|---|
name | |||
John | 26.0 | 182.0 | 82.0 |
Mary | 21.0 | 171.0 | 56.0 |
Peter | 19.0 | 179.0 | 79.0 |
Susan | 29.0 | 175.0 | 60.0 |
trials = pd.read_hdf("nrdd_rephub_targets.hdf")
trials.sample(3)
Name | Indication | Phase | Therapeutic categories | rdkit_smiles | Targets | CID | |
---|---|---|---|---|---|---|---|
5442 | edaglitazone | non-insulin dependent diabetes | discontinued | {ENDOCRINE DRUGS} | Cc1oc(-c2ccccc2)nc1CCOc1ccc(CC2SC(=O)NC2=O)c2s... | NaN | 9825701 |
7117 | modafinil | shift work disorder (swd) | pre-registration and above | {NEUROLOGIC DRUGS, PSYCHOPHARMACOLOGIC DRUGS, ... | NC(=O)CS(=O)C(c1ccccc1)c1ccccc1 | {CYP2D6, CYP2C19, CYP3A4, Slc6a3, PTGS2, SLC6A... | 4236 |
2600 | il-16 | asthma | discontinued | {ANTIINFECTIVE THERAPY} | NN1C(=O)CC(c2cccc(Br)c2)C1=O | NaN | 125225 |
trials_gby = trials.groupby("Phase")
<pandas.core.groupby.DataFrameGroupBy object at 0x7fb19d436c88>
# Number of rows in each Phase group, sorted in ascending order.
sizes = trials_gby.size().sort_values()
sizes
Phase phase 1 clinical 123 discovery 135 phase 3 clinical 208 phase 2 clinical 421 pre-registration and above 2961 discontinued 3434 dtype: int64
import seaborn as sns
import matplotlib.pyplot as plt
# Bar chart of the number of rows per phase.
sns.set(style="whitegrid", font_scale=1.6)
# Pass the data by keyword: newer seaborn releases no longer accept x and y
# as positional arguments.
sns.barplot(x=sizes.index, y=sizes.values)
# Tilt the tick labels so the long phase names stay readable.
plt.xticks(rotation=45)
(array([0, 1, 2, 3, 4, 5]), <a list of 6 Text xticklabel objects>)
def size_apply(df) -> int:
    """Return the number of rows in *df*.

    Written for ``GroupBy.apply``, which calls it once per group with that
    group's sub-frame; any object supporting ``len()`` is accepted.
    """
    return len(df)
trials_gby.apply(size_apply)
Phase discontinued 3434 discovery 135 phase 1 clinical 123 phase 2 clinical 421 phase 3 clinical 208 pre-registration and above 2961 dtype: int64
def size_agg(ser) -> int:
    """Return the number of entries in *ser*.

    Written for ``GroupBy.agg``, which calls it once per column of every
    group; any object supporting ``len()`` is accepted.
    """
    return len(ser)
trials_gby.agg(size_agg)
Name | Indication | Therapeutic categories | rdkit_smiles | Targets | CID | |
---|---|---|---|---|---|---|
Phase | ||||||
discontinued | 3434 | 3434 | 3434 | 3434 | 3434 | 3434 |
discovery | 135 | 135 | 135 | 135 | 135 | 135 |
phase 1 clinical | 123 | 123 | 123 | 123 | 123 | 123 |
phase 2 clinical | 421 | 421 | 421 | 421 | 421 | 421 |
phase 3 clinical | 208 | 208 | 208 | 208 | 208 | 208 |
pre-registration and above | 2961 | 2961 | 2961 | 2961 | 2961 | 2961 |
from random import choice
# Reload the grades table, using the first CSV column as the index.
grades = pd.read_csv("grades.csv", index_col=0)
# Assign every row a randomly chosen colour to get a categorical column to
# group by. The draw is unseeded, so the groups differ from run to run.
grades["favourite_color"] = [choice(["blue", "red",
                                     "green", "hazelnut"])
                             for _ in grades.index]
grades_gby = grades.groupby("favourite_color")
# Per-colour mean of every remaining column. The string alias selects the
# built-in groupby mean; recent pandas versions deprecate passing np.mean here.
grades_gby.agg("mean")
id | write | math | science | socst | |
---|---|---|---|---|---|
favourite_color | |||||
blue | 98.294118 | 50.784314 | 51.529412 | 49.294118 | 52.490196 |
green | 100.810345 | 54.586207 | 53.931034 | 52.137931 | 52.603448 |
hazelnut | 105.794872 | 51.128205 | 51.897436 | 53.282051 | 51.589744 |
red | 98.346154 | 53.942308 | 52.865385 | 52.961538 | 52.711538 |